{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import joblib \n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. 从特征重要性最高的前100个特征中，筛选出用于交叉的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of leading f_score entries (features) passed to gen_data() as candidates.\n",
    "COL_NUM = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "def timer(func):\n",
    "    '''Decorator that prints how long each call to `func` takes.\n",
    "\n",
    "    The returned wrapper forwards all positional and keyword arguments,\n",
    "    passes the return value of `func` through unchanged, and keeps the\n",
    "    `__name__` / `__doc__` of `func` (via `functools.wraps`).\n",
    "    Nothing is printed when `func` raises.\n",
    "    '''\n",
    "    from functools import wraps\n",
    "\n",
    "    @wraps(func)\n",
    "    def wrapper(*args,**kwargs):\n",
    "        # perf_counter is monotonic, so the measurement cannot be skewed by\n",
    "        # system clock adjustments the way time.time() can.\n",
    "        start = time.perf_counter()\n",
    "        result = func(*args,**kwargs)\n",
    "        elapsed = time.perf_counter() - start\n",
    "        print(func.__name__+'运行时间：',elapsed)\n",
    "        return result\n",
    "    return wrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scores whose index holds the feature (column) names; gen_data() reads this global.\n",
    "# NOTE(review): presumably sorted by importance, most important first - confirm.\n",
    "f_score = joblib.load('./1.get_raw_fscore/f_score')\n",
    "\n",
    "# Only the first N_TRAIN_ROWS rows of each preprocessed frame are used.\n",
    "N_TRAIN_ROWS = 33465\n",
    "\n",
    "input_dir = '../../../preprocess_data_new/'\n",
    "data_date = joblib.load(input_dir + 'train_ax_date.lz4')[:N_TRAIN_ROWS]\n",
    "data_nodup =joblib.load(input_dir + 'train_ax_nodup.lz4').drop(columns=['id','loan_dt'])[:N_TRAIN_ROWS]\n",
    "data_null = joblib.load(input_dir + 'train_ax_null.lz4')[:N_TRAIN_ROWS]\n",
    "# Move `tag` to the last column to match the validation data layout,\n",
    "# where the tag feature is appended at the end.\n",
    "data_tag = data_nodup[['tag']]\n",
    "data_nodup = data_nodup.drop(columns=['tag'])\n",
    "data = pd.concat([data_date,data_nodup,data_null,data_tag],axis=1)\n",
    "\n",
    "# Missing values are replaced by -1 in the raw array view only; `data` keeps its NaNs.\n",
    "x = data.fillna(-1).values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_uniq(df,uniq_num=100):\n",
    "    '''Keep only the columns holding more than `uniq_num` distinct values.\n",
    "\n",
    "    NaN counts as one distinct value. Columns with exactly `uniq_num`\n",
    "    distinct values are dropped as well (the comparison is strict).\n",
    "    '''\n",
    "    kept = []\n",
    "    for name in df.columns:\n",
    "        n_distinct = df[name].nunique(dropna=False)\n",
    "        if n_distinct > uniq_num:\n",
    "            kept.append(name)\n",
    "    return df[kept]\n",
    "def filter_dup(df,percent=80):\n",
    "    '''Drop columns that are dominated by one repeated value.\n",
    "\n",
    "    A column is kept only when its most frequent non-null value occurs in\n",
    "    fewer than `percent` percent of the rows.\n",
    "\n",
    "    df      : DataFrame whose columns are screened.\n",
    "    percent : threshold on a 0-100 scale (80 means 80% of the rows).\n",
    "\n",
    "    Returns the sub-frame `df[kept_columns]`, column order preserved.\n",
    "    '''\n",
    "    # `percent` is a percentage, so scale it to a row count; multiplying the\n",
    "    # row count by the raw value (e.g. 80) made the filter a no-op.\n",
    "    thre = percent / 100 * df.shape[0]\n",
    "    col_list = []\n",
    "    for col in df.columns:\n",
    "        # value_counts() sorts descending and skips NaN, so the first entry\n",
    "        # is the count of the most frequent non-null value.\n",
    "        counts = df[col].value_counts()\n",
    "        # An all-NaN column has no counts at all; treat its top count as 0.\n",
    "        top_count = counts.values[0] if len(counts) else 0\n",
    "        if top_count < thre:\n",
    "            col_list.append(col)\n",
    "    return df[col_list]\n",
    "def gen_data(df,col_num=50):\n",
    "    '''Pick candidate feature columns from `df` and screen them.\n",
    "\n",
    "    The first `col_num` index labels of the global `f_score` select the\n",
    "    columns; the selection then goes through filter_uniq(..., 100) and\n",
    "    filter_dup(..., 80), in that order.\n",
    "    '''\n",
    "    top_cols = f_score[:col_num].index\n",
    "    candidates = df[top_cols]\n",
    "    return filter_dup(filter_uniq(candidates,100),80)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns to be crossed: the first COL_NUM f_score features that pass both filters.\n",
    "data_sel = gen_data(data,COL_NUM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['data_sel.lz4']"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Persist the selected columns to 'data_sel.lz4' with lz4 compression.\n",
    "joblib.dump(data_sel,'data_sel.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### multiply——做乘法交叉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from joblib import Parallel, delayed\n",
    "\n",
    "def one_cross(df,col_0):\n",
    "    '''Multiply column `col_0` element-wise with every column of `df`.\n",
    "\n",
    "    Returns a DataFrame with one column per column of `df`, named\n",
    "    '<col_0>*<other>' and ordered like `df.columns` (`col_0` times itself\n",
    "    is included). The products are stored as raw arrays, so the result\n",
    "    has a fresh default integer index rather than the index of `df`.\n",
    "    '''\n",
    "    left = df[col_0]\n",
    "    # Collect all products first and build the frame once; inserting columns\n",
    "    # one by one into an empty DataFrame is slow and fragments the frame.\n",
    "    products = {\n",
    "        '{0}*{1}'.format(col_0,col_1): (left*df[col_1]).values\n",
    "        for col_1 in df.columns\n",
    "    }\n",
    "    return pd.DataFrame(products)\n",
    "\n",
    "@timer\n",
    "def parallel_cross(df,n_jobs=32,verbose=10):\n",
    "    '''Build all pairwise product features of `df` using joblib workers.\n",
    "\n",
    "    One one_cross(df, col) task is scheduled per column; the per-column\n",
    "    results are concatenated side by side in the order of `df.columns`.\n",
    "    Both 'a*b' and 'b*a' are produced, as well as each column squared.\n",
    "\n",
    "    df      : DataFrame of numeric feature columns.\n",
    "    n_jobs  : number of joblib workers (default 32, the former fixed value).\n",
    "    verbose : joblib progress verbosity (default 10, the former fixed value).\n",
    "    '''\n",
    "    if len(df.columns) == 0:\n",
    "        # pd.concat raises on an empty list, so return an empty frame instead.\n",
    "        return pd.DataFrame()\n",
    "    df_list = Parallel(n_jobs=n_jobs,verbose=verbose)(delayed(one_cross)(df,col_0) for col_0 in df.columns)\n",
    "    return pd.concat(df_list,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the selected columns from the 'data_sel.lz4' file on disk.\n",
    "data_sel = joblib.load('data_sel.lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.\n",
      "[Parallel(n_jobs=32)]: Done   8 tasks      | elapsed:    0.6s\n",
      "[Parallel(n_jobs=32)]: Done  21 tasks      | elapsed:    1.3s\n",
      "[Parallel(n_jobs=32)]: Done  32 out of  86 | elapsed:    2.2s remaining:    3.6s\n",
      "[Parallel(n_jobs=32)]: Done  41 out of  86 | elapsed:    2.7s remaining:    3.0s\n",
      "[Parallel(n_jobs=32)]: Done  50 out of  86 | elapsed:    3.2s remaining:    2.3s\n",
      "[Parallel(n_jobs=32)]: Done  59 out of  86 | elapsed:    3.8s remaining:    1.7s\n",
      "[Parallel(n_jobs=32)]: Done  68 out of  86 | elapsed:    4.3s remaining:    1.1s\n",
      "[Parallel(n_jobs=32)]: Done  77 out of  86 | elapsed:    4.8s remaining:    0.6s\n",
      "[Parallel(n_jobs=32)]: Done  86 out of  86 | elapsed:    5.3s remaining:    0.0s\n",
      "[Parallel(n_jobs=32)]: Done  86 out of  86 | elapsed:    5.3s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "parallel_cross运行时间： 8.780004262924194\n"
     ]
    }
   ],
   "source": [
    "# Pairwise products of all selected columns; the @timer decorator prints the elapsed time.\n",
    "data_cf = parallel_cross(data_sel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['data_cf.lz4']"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save the crossed features; the row count of data_sel is embedded in the file name.\n",
    "joblib.dump(data_cf,'data_cf_{}.lz4'.format(data_sel.shape[0]),compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
